{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using hgvs\n",
    "This notebook demonstrates major features of the hgvs package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.5.0a6.dev3+nf998c16a46b3.d20161012'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import hgvs\n",
    "hgvs.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variant I/O"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Initialize the parser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# You only need to do this once per process\n",
    "import hgvs.parser\n",
    "hp = hgvsparser = hgvs.parser.Parser()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parse a simple variant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "v = hp.parse_hgvs_variant(\"NC_000007.13:g.21726874G>A\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000007.13, type=g, posedit=21726874G>A)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('NC_000007.13', 'g')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.ac, v.type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PosEdit(pos=21726874, edit=G>A, uncertain=False)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.posedit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Interval(start=21726874, end=21726874, uncertain=False)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.posedit.pos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SimplePosition(base=21726874, uncertain=False)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.posedit.pos.start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parsing complex variants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "v = hp.parse_hgvs_variant(\"NM_003777.3:c.13552_*36del57\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(BaseOffsetPosition(base=13552, offset=0, datum=1, uncertain=False),\n",
       " BaseOffsetPosition(base=36, offset=0, datum=2, uncertain=False))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.posedit.pos.start, v.posedit.pos.end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NARefAlt(ref=57, alt=None, uncertain=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v.posedit.edit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Formatting variants\n",
    "All objects may be formatted simply by \"stringifying\" or printing them using `str`, `print()`, or `\"\".format()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NM_003777.3:c.13552_*36del57'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NM_003777.3:c.13552_*36del57\n"
     ]
    }
   ],
   "source": [
    "print(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NM_003777.3:c.13552_*36del57 spans the CDS end'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"{v} spans the CDS end\".format(v=v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Projecting variants between sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set up a dataprovider\n",
    "\n",
    "Mapping variants requires exon structures, alignments, CDS bounds, and raw sequence. These are provided by a `hgvs.dataprovider` instance. The only dataprovider provided with hgvs uses UTA. You may write your own by subsclassing hgvs.dataproviders.interface."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import hgvs.dataproviders.uta\n",
    "hdp = hgvs.dataproviders.uta.connect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Initialize mapper classes\n",
    "The VariantMapper class projects variants between two sequence accessions using alignments from a specified source. In order to use it, you must know that two sequences are aligned. VariantMapper isn't demonstrated here.\n",
    "\n",
    "AssemblyMapper builds on VariantMapper and handles identifying appropriate sequences. It is configured for a particular genome assembly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import hgvs.variantmapper\n",
    "#vm = variantmapper = hgvs.variantmapper.VariantMapper(hdp)\n",
    "am37 = easyvariantmapper = hgvs.variantmapper.AssemblyMapper(hdp, assembly_name='GRCh37')\n",
    "am38 = easyvariantmapper = hgvs.variantmapper.AssemblyMapper(hdp, assembly_name='GRCh38')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### c_to_g\n",
    "This is the easiest case because there is typically only one alignment between a transcript and the genome. (Exceptions exist for pseudoautosomal regions.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "var_c = hp.parse_hgvs_variant(\"NM_015120.4:c.35G>C\")\n",
    "var_g = am37.c_to_g(var_c)\n",
    "var_g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "am38.c_to_g(var_c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### g_to_c\n",
    "In order to project a genomic variant onto a transcript, you must tell the AssemblyMapper which transcript to use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['NM_015120.4']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am37.relevant_transcripts(var_g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NM_015120.4, type=c, posedit=35T>C)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am37.g_to_c(var_g, \"NM_015120.4\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### c_to_p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NP_055935.4:p.(Leu12Pro)'"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "var_p = am37.c_to_p(var_c)\n",
    "str(var_p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NP_055935.4:p.Leu12Pro'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "var_p.posedit.uncertain = False\n",
    "str(var_p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Projecting in the presence of a genome-transcript gap\n",
    "As of Oct 2016, 1033 RefSeq transcripts in 433 genes have gapped alignments. These gaps require special handlingin order to maintain the correspondence of positions in an alignment. hgvs uses the precomputed alignments in UTA to correctly project variants in exons containing gapped alignments. \n",
    "\n",
    "This example demonstrates projecting variants in the presence of a gap in the alignment of NM_015120.4 (ALMS1) with GRCh37 chromosome 2. (The alignment with GRCh38 is similarly gapped.) Specifically, the adjacent genomic positions 73613031 and 73613032 correspond to the non-adjacent CDS positions 35 and 39.\n",
    "\n",
    "```\n",
    " NM_015120.4  c         15 >                           >       58\n",
    " NM_015120.4  n        126 > CCGGGCGAGCTGGAGGAGGAGGAG  >      169\n",
    "                             |||||||||||   ||||||||||  21=3I20= \n",
    " NC_000002.11 g   73613021 > CCGGGCGAGCT---GGAGGAGGAG  > 73613041\n",
    " NC_000002.11 g   73613021 < GGCCCGCTCGA---CCTCCTCCTC  < 73613041                                                  \n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NC_000002.11:g.73613031T>C'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str(am37.c_to_g(hp.parse_hgvs_variant(\"NM_015120.4:c.35G>C\")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NC_000002.11:g.73613032G>C'"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str(am37.c_to_g(hp.parse_hgvs_variant(\"NM_015120.4:c.39G>C\")))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Normalizing variants\n",
    "In hgvs, normalization means shifting variants 3' (as requried by the HGVS nomenclature) as well as rewriting variants. The variant \"NM_001166478.1:c.30_31insT\" is in a poly-T run (on the transcript). It should be shifted 3' and is better written as dup, as shown below:\n",
    "```\n",
    "                                         *                       NC_000006.11:g.49917127dupA\n",
    "   NC_000006.11 g   49917117 > AGAAAGAAAAATAAAACAAAG  > 49917137 \n",
    "   NC_000006.11 g   49917117 < TCTTTCTTTTTATTTTGTTTC  < 49917137 \n",
    "                               |||||||||||||||||||||  21= \n",
    " NM_001166478.1 n         41 < TCTTTCTTTTTATTTTGTTTC  <       21 NM_001166478.1:n.35dupT\n",
    " NM_001166478.1 c         41 <                        <       21 NM_001166478.1:c.30_31insT\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import hgvs.normalizer\n",
    "hn = hgvs.normalizer.Normalizer(hdp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NM_001166478.1:c.35dupT'"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v = hp.parse_hgvs_variant(\"NM_001166478.1:c.30_31insT\")\n",
    "str(hn.normalize(v))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## A more complex normalization example\n",
    "This example is based on https://github.com/biocommons/hgvs/issues/382/.\n",
    "\n",
    "```\n",
    "   NC_000001.11 g   27552104 > CTTCACACGCATCCTGACCTTG > 27552125\n",
    "   NC_000001.11 g   27552104 < GAAGTGTGCGTAGGACTGGAAC < 27552125\n",
    "                               |||||||||||||||||||||| 22= \n",
    " NM_001029882.3 n        843 < GAAGTGTGCGTAGGACTGGAAC <      822 \n",
    " NM_001029882.3 c         12 <                        <      -10 \n",
    "                                         ^^  \n",
    "                                         NM_001029882.3:c.1_2del\n",
    "                                         NM_001029882.3:n.832_833delAT\n",
    "                                         NC_000001.11:g.27552114_27552115delAT\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552115T>C)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.1A>G\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552114A>C)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.2T>G\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552114_27552115delAT)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.1_2del\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The genomic coordinates for the SNVs at c.1 and c.2 match those for the del at c.1_2. Good!\n",
    "\n",
    "Now, notice what happens with c.1_3del, c.1_4del, and c.1_5del:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552114_27552116delATC)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.1_3del\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552112_27552115delGCAT)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.1_4del\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SequenceVariant(ac=NC_000001.11, type=g, posedit=27552112_27552116delGCATC)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "am38.c_to_g(hp.parse_hgvs_variant(\"NM_001029882.3:c.1_5del\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Explanation:\n",
    "\n",
    "On the transcript, c.1_2delAT deletes AT from …AGGATGCG…, resulting in …AGGGCG…. There's no ambiguity about what sequence was actually deleted.\n",
    "\n",
    "c.1_3delATG deletes ATG, resulting in …AGGCG…. Note that you could also get this result by deleting GAT. This is an example of an indel that is subject to normalization and hgvs does this.\n",
    "\n",
    "c.1_4delATGC and 1_5delATGCG have similar behaviors.\n",
    "\n",
    "Normalization is always 3' with respect to the reference sequence. So, after projecting from a - strand transcript to the genome, normalization will go in the opposite direction to the transcript. It will have roughly the same effect as being 5' shifted on the transcript (but revcomp'd).\n",
    "\n",
    "For more precise control, see the `normalize` and `replace_reference` options of `AssemblyMapper`.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validating variants\n",
    "`hgvs.validator.Validator` is a composite of two classes, `hgvs.validator.IntrinsicValidator` and `hgvs.validator.ExtrinsicValidator`. Intrinsic validation evaluates a given variant for *internal* consistency, such as requiring that insertions specify adjacent positions.  Extrinsic validation evaluates a variant using external data, such as ensuring that the reference nucleotide in the variant matches that implied by the reference sequence and position. Validation returns `True` if successful, and raises an exception otherwise. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import hgvs.validator\n",
    "hv = hgvs.validator.Validator(hdp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hv.validate(hp.parse_hgvs_variant(\"NM_001166478.1:c.30_31insT\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "insertion length must be 1\n"
     ]
    }
   ],
   "source": [
    "from hgvs.exceptions import HGVSError\n",
    "\n",
    "try:\n",
    "    hv.validate(hp.parse_hgvs_variant(\"NM_001166478.1:c.30_32insT\"))\n",
    "except HGVSError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
